### extract a breakdown of collection classifications

# Load packages

import pyspark.sql.functions as sqlf
import pandas as pd
import numpy  as np
import os
import sys
import shutil
from pyspark     import SparkContext, SparkConf
from pyspark.sql import SQLContext
from SparkDataTools2 import *


# Set spark working path
WORKDIR = '/geo_debt/geo/Spark'
CODEDIR = '/geo_debt/geo/Code/1_Spark/'
OUTDIR  = '/geo_debt/geo/Output/Table'

#go to owrking directory
os.chdir(WORKDIR)

#########################
###       Main        ###
#########################

yList = range(2015, 2015)
m   = 6


for y in yList:


    collectionsData = table_setup(sqlContext, y, m, 'collections')
    collectionsData.createOrReplaceTempView("collectionsData")

    ym = yyyymm(y,m)


    ## collection classification
    coll = "(SELECT currentBalanceAmount, collectionKey, "+\
           "CASE WHEN prohibitedCode=2 THEN 2 " +\
           "WHEN creditorClassificiationCode IS NOT NULL THEN creditorClassificiationCode "+\
           "ELSE 999 END AS creditorClassificiationCode "+\
           "FROM collectionsData " +\
           "WHERE highCreditAmount >100 AND mannerOfPayment='9B') "

    sum_bal = "(SELECT SUM(currentBalanceAmount) FROM " + coll + ") "

    tot_count = "(SELECT COUNT(collectionKey) FROM " + coll + ") "

    sql0 = "(SELECT creditorClassificiationCode, 100*SUM(currentBalanceAmount)/" + sum_bal + " AS pct_current_balance, "+\
          "100*COUNT(collectionKey)/" + tot_count + " AS pct_num_of_account, "+\
          "AVG(currentBalanceAmount) AS avg_balance "+\
          "FROM " + coll + " " +\
          "GROUP BY creditorClassificiationCode " + \
          "ORDER BY creditorClassificiationCode) "

    # rename
    sql = "SELECT CASE "+\
            "WHEN creditorClassificiationCode=1 THEN 'Retail' " +\
            "WHEN creditorClassificiationCode=2 THEN 'Medical/Health Care' " +\
            "WHEN creditorClassificiationCode=3 THEN 'Oil Company' " +\
            "WHEN creditorClassificiationCode=4 THEN 'Government' " +\
            "WHEN creditorClassificiationCode=5 THEN 'Personal Service' " +\
            "WHEN creditorClassificiationCode=6 THEN 'Insurance' " +\
            "WHEN creditorClassificiationCode=7 THEN 'Educational' " +\
            "WHEN creditorClassificiationCode=8 THEN 'Banking' " +\
            "WHEN creditorClassificiationCode=9 THEN 'Rental/Leasing' " +\
            "WHEN creditorClassificiationCode=10 THEN 'Utilities' " +\
            "WHEN creditorClassificiationCode=11 THEN 'Cable/Cellular' " +\
            "WHEN creditorClassificiationCode=12 THEN 'Financial' " +\
            "WHEN creditorClassificiationCode=13 THEN 'Credit Union' " +\
            "WHEN creditorClassificiationCode=14 THEN 'Automotive' " +\
            "WHEN creditorClassificiationCode=15 THEN 'Check Guarantee' " +\
            "WHEN creditorClassificiationCode=999 THEN 'NA' " +\
            "ELSE creditorClassificiationCode END AS creditors, "+\
            "pct_current_balance, pct_num_of_account, avg_balance " +\
            "FROM " + sql0 + " "
    # sqlContext.sql(sql).show()
    saveStata(sql,'coll_'+ym, WORKDIR, OUTDIR)


sqlContext.clearCache()

print(' !!!! SUCCESS !!!!')
